Reading in Data

# Load plotting library and read the tab-separated pseudo-Facebook sample.
# read.delim() is the purpose-built reader for TSV files; it is equivalent
# to read.csv(..., sep = "\t") (header = TRUE, quote = '"', dec = ".").
library(ggplot2)
pf <- read.delim("pseudo_facebook.tsv")

Scatterplots

# First look: raw scatterplot of friend count against age for every user.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point()

# Five-number summary plus mean of user age; the maximum of 113 suggests
# some users misreport their age.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

What are some things that you notice right away?

  • Younger users have a lot of friends
  • There are some vertical bars where people have lied about their age, like 69 and 100.

Limit x

# Restrict to plausible ages; xlim() discards points outside 13-90 before
# plotting, which produces the "Removed ... rows" warning below.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).


Overplotting

Notes: The bulk of data lies below the 1000 threshold for friend count. Using “jitter”, we can add some noise to each age so we get a clearer picture of the relationship between age and friend count (as age is a continuous variable)

# Jitter plus transparency reduce overplotting: each point gets random
# noise and 1/20 opacity, so dense regions read as darker bands.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 5169 rows containing missing values (geom_point).

What do you notice in the plot?

  • The friend counts for young users aren’t nearly as high as they looked before.

Coord_trans()

# Square-root-transform the y axis with coord_trans(): points are drawn
# in transformed space while the axis keeps the original friend-count labels.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

What do you notice?

  • It’s much easier to see the distribution of friend count, conditional on age.
  • We removed jitter, because if we add noise to people with 0 friends, we might end up with negative numbers and those sqrt would be imaginary.

Coord_trans() with jitter solution

# Jitter ages horizontally but not vertically (height = 0) so users with
# 0 friends are not pushed to negative counts, whose square roots would be
# undefined. `height` is spelled out rather than relying on partial
# matching of the abbreviated `h`.
ggplot(aes(x = age, y = friend_count), data = pf) + 
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5198 rows containing missing values (geom_point).


Alpha and Jitter

Explore the relationship between friends initiated vs age.

# Friendships initiated vs. age: horizontal-only jitter (height = 0,
# spelled out instead of the partially matched `h`), 1/10 opacity, and a
# sqrt-compressed y axis.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/10, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5179 rows containing missing values (geom_point).


Conditional Means

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Conditional means: mean/median friend count and sample size per age.
# Written as a single dplyr pipeline for consistency with the
# pf.fc_by_age_months computation later in this document.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age)
head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Create your plot!

# Mean friend count at each age, as a line.
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

# Raw data (jittered, semi-transparent, orange) overlaid with summary
# lines: the mean (black) and the 10th/50th/90th percentiles (blue; outer
# quantiles dashed via linetype = 2). coord_cartesian() zooms the view
# without discarding data, unlike xlim()/ylim().
# NOTE(review): fun.y/fun.args is the pre-ggplot2-3.3 spelling (now `fun`),
# and `h` in position_jitter relies on partial matching of `height` --
# left unchanged to match the rendered output.
ggplot(aes(x = age, y = friend_count), data = pf) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(alpha = 0.05,
             position = position_jitter(h = 0),
             color = "orange") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = quantile,
            fun.args = list(probs = .1),
            linetype = 2, color = "blue") +
  geom_line(stat = "summary", fun.y = quantile,
          fun.args = list(probs = .5),
          color = "blue") +
  geom_line(stat = "summary", fun.y = quantile,
          fun.args = list(probs = .9),
          linetype = 2, color = "blue")

What are some of your observations of the plot?

  • Having more than 1000 friends is quite rare
  • For users aged 35–60, the mean friend count falls below 250.

Correlation

# Pearson correlation over all users: r = -0.027 -- statistically
# significant (the sample is huge) but practically negligible.
cor.test(pf$age, pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

What’s the correlation between age and friend count? Round to three decimal places.


Correlation on Subsets

# Dropping implausible reported ages (> 70) strengthens the negative
# correlation to r = -0.172.
with(subset(pf, age <= 70), cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Pearson, Kendall, Spearman


Create Scatterplots

# Likes received on the web interface vs. likes received overall.
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()


Strong Correlations

# Zoom to the bottom 95% of each variable and overlay a linear fit.
# xlim()/ylim() drop the excluded points before the smooth is fitted,
# hence the warnings below.
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# r = 0.948. The strong correlation is expected rather than informative:
# likes_received is a superset that includes www_likes_received (see the
# note below).
with(pf, cor.test(www_likes_received, likes_received))
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
  • Correlation = 0.948
  • The correlation was actually an artifact of the nature of the variables as one of them is superset of the other.

Moira on Correlation

  • Prefer independent variables to each other, so as to find the one which drives the phenomenon.
  • First measure the correlation between our variables and then determine which ones you don’t actually want to throw in together ==> decide which to keep.

More Caution with Correlation

# install.packages('alr3')
library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Mitchell: monthly temperature data from the alr3 package (see ?Mitchell
# for the full description); Month is an index starting at 0.
data(Mitchell)
?Mitchell
head(Mitchell)
##   Month     Temp
## 1     0 -5.18333
## 2     1 -1.65000
## 3     2  2.49444
## 4     3 10.40000
## 5     4 14.99440
## 6     5 21.71670

Create your plot!

# Temperature against the raw month index.
ggplot(Mitchell) +
  geom_point(aes(x = Month, y = Temp))


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.
  • Correlation = 0.1
  2. What is the actual correlation of the two variables?
  • Correlation = 0.057
# Pearson r = 0.057 (p = 0.41): essentially no linear association, even
# though the scatterplot shows a clear seasonal pattern -- Pearson
# correlation only measures linear relationships.
with(Mitchell, cor.test(Month, Temp))
## 
##  Pearson's product-moment correlation
## 
## data:  Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Seems like a pretty weak correlation.


Making Sense of Data

# Month runs 0-203 (about 17 years of data); placing a break every 12
# months aligns the x-axis with year boundaries, exposing the cycle.
range(Mitchell$Month)
## [1]   0 203
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice?

  • A cyclical pattern

Watch the solution video and check out the Instructor Notes!

  • The cor and cor.test functions determine the strength of a linear relationship, but they may miss other relationships in the data.
# Collapse Month to month-of-year with modulo 12; all Januaries line up,
# making the seasonal cycle explicit. (The quoted expression is kept
# as-is so the axis label is unchanged.)
ggplot(aes(x = (Month%%12), y = Temp), data = Mitchell) +
  geom_point()

# Distance correlation can detect non-linear dependence that Pearson
# misses; here the test still does not reject independence.
# NOTE(review): dcor.ttest() was deprecated in later versions of the
# energy package in favour of dcorT.test() -- confirm installed version.
# install.packages("energy")
library(energy)
with(Mitchell, dcor.ttest(Month, Temp))
## 
##  dcor t-test of independence
## 
## data:  Month and Temp
## T = -0.93904, df = 20501, p-value = 0.8261
## sample estimates:
## Bias corrected dcor 
##        -0.006558215

Understanding Noise: Age to Age Months

# Re-plot mean friend count by age before refining the age measure.
ggplot(data = pf.fc_by_age, aes(age, friend_count_mean)) +
  geom_line()

# First ten ages: the mid-teens peak is visible in the means.
head(pf.fc_by_age, 10)
## # A tibble: 10 x 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
##  1    13          164.7500                74.0   484
##  2    14          251.3901               132.0  1925
##  3    15          347.6921               161.0  2618
##  4    16          351.9371               171.5  3086
##  5    17          350.3006               156.0  3283
##  6    18          331.1663               162.0  5196
##  7    19          333.6921               157.0  4391
##  8    20          283.4991               135.0  3769
##  9    21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
# Rows 17-19 correspond to ages 29-31: the means sit close together, so
# small differences between adjacent ages are mostly noise.
pf.fc_by_age[17:19, ]
## # A tibble: 3 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    29          120.8182                66.0  1936
## 2    30          115.2080                67.5  1716
## 3    31          118.4599                63.0  1694
# Refine integer age into a months-resolution measure: a user whose
# birthday month has not yet passed is treated as proportionally older.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
# Alternative solution: identical result, evaluated inside pf via with().
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))

Age with Months Means

# Same conditional-means summary as pf.fc_by_age, but grouped by the
# finer-grained age_with_months measure (so each bin holds fewer users).
pf.fc_by_age_months <- pf %>% 
  group_by(age_with_months) %>%
  summarise(friend_count_mean = mean(friend_count), 
          friend_count_median = median(friend_count),
          n = n()) %>%
  arrange(age_with_months)

head(pf.fc_by_age_months)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54

Noise in Conditional Means

# Conditional means by age-with-months; finer bins mean fewer users per
# bin, hence the noisier line. Ages 71+ are dropped as unreliable.
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
       aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()


Smoothing Conditional Means

# Three bin widths for the same relationship, stacked for comparison:
#   p1: means per whole year of age, with a loess smoother overlaid
#   p2: means per age-with-months (finer bins, noisier line), plus loess
#   p3: means per 5-year age bucket (coarser bins, smoother line)
# NOTE(review): fun.y was superseded by `fun` in ggplot2 3.3 -- left
# unchanged to match the rendered output.
p1 <- ggplot(aes(x = age, y = friend_count_mean),
             data = subset(pf.fc_by_age, age < 71)) +
  geom_line() +
  geom_smooth()
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
             data = subset(pf.fc_by_age_months, age_with_months < 71)) +
  geom_line() +
  geom_smooth()
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
             data = subset(pf, age < 71)) +
  geom_line(stat = "summary", fun.y = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(p1, p2, p3, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'


Which Plot to Choose?

You don’t have to choose. In exploratory data analysis, we often create multiple visualisations and summaries of the same data. Each graph or summary may reveal different things about the same data.


Analyzing Two Variables

Reflection

  • Explored the relationship between two variables.
  • Main visualising tool used was the scatterplot.
  • Augmented the scatterplot with conditional summaries.
  • Correlation - Benefits and limitations
  • How correlation may affect your decisions over which variables to include in your final models.
  • How to make sense of data through adjusting our visualisations.
  • How to use jitter and transparency to reduce overplotting.

Load diamonds data set.

# diamonds ships with ggplot2: 53,940 rows, 10 variables. cut, color and
# clarity are ordered factors; x, y, z are the physical dimensions in mm.
data("diamonds")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Create a scatterplot of price vs x.

# Price against the diamond's length (x, in mm).
ggplot(aes(x = x, y = price), data = diamonds) +
  geom_point()

  • There are some outliers.
  • There is an exponential relationship between price and x.

Correlations

# Price vs. each physical dimension: all three correlate strongly.
# x (length): r = 0.884
with(diamonds, cor.test(price, x))
## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352
# y (width): r = 0.865
with(diamonds, cor.test(price, y))
## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209
# z (height in mm): r = 0.861
with(diamonds, cor.test(price, z))
## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Create a scatterplot of price vs depth.

# NOTE(review): the heading asks for price vs. depth, but this plots
# depth against x -- confirm which was intended before relying on it.
ggplot(data = diamonds, aes(x = x, y = depth)) +
  geom_point()

Make some adjustments - price vs. depth.

# x spans 0-10.74 mm; alpha = 1/100 means roughly 100 overlapping points
# are needed for full opacity, and breaks mark the x-axis every 2 units.
# NOTE(review): heading says "price vs. depth" but depth is plotted
# against x here -- confirm which was intended.
range(diamonds$x)
## [1]  0.00 10.74
ggplot(data = diamonds, aes(x = x, y = depth)) +
  geom_point(alpha = 1/100) +
  scale_x_continuous(breaks=seq(0,11,2))

Correlation between price and depth.

# Pearson correlation between price and depth: about -0.011 --
# essentially no linear relationship.
with(diamonds, cor.test(price, depth))
## 
##  Pearson's product-moment correlation
## 
## data:  price and depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474
  • Based on this pretty weak correlation we cannot predict the price of a diamond using depth

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.

# Price vs. carat with the top 1% of each variable removed to limit the
# influence of extreme outliers. subset() evaluates column names inside
# the data frame, so the diamonds$ prefixes were redundant.
# NOTE(review): "price vs carat" conventionally puts price on the y-axis;
# the original axis assignment is kept as plotted -- confirm intent.
ggplot(data = subset(diamonds, price < quantile(price, 0.99) &
                       carat < quantile(carat, 0.99)),
       aes(x = price, y = carat)) +
  geom_point(alpha = 1/20)

Create a scatterplot of price vs. volume (x * y * z).

# Approximate volume from the x, y, z dimensions. This is a rough proxy
# (a box, not a cut stone) and equals 0 wherever any dimension is 0.
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z
# NOTE(review): heading says "price vs. volume"; volume is on the y-axis
# here, while price would conventionally go on y -- confirm intent.
ggplot(data = diamonds, aes(x = price, y = volume)) +
  geom_point()

  • There are some outliers.
  • There is a linear relationship between price and volume.
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# How many diamonds have zero volume (i.e. an x, y or z of 0)? 20.
# NOTE(review): sum(diamonds$volume == 0) or table(...) would give the
# same answer without loading plyr on top of dplyr (see warning above).
count(diamonds$volume == 0)
##       x  freq
## 1 FALSE 53920
## 2  TRUE    20

Correlation of price and volume.

Exclude diamonds that have a volume greater than or equal to 800 and diamonds that have a volume of 0.

# Correlation of price and volume on the trimmed data: r = 0.924.
with(subset(diamonds, !(volume == 0 | volume >= 800)), cor.test(price, volume))
## 
##  Pearson's product-moment correlation
## 
## data:  price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455
# Or: the same filter written as a positive condition (identical result).
with(subset(diamonds, volume > 0 & volume < 800), cor.test(price, volume))
## 
##  Pearson's product-moment correlation
## 
## data:  price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Adjustments - price vs. volume

Subset the data to exclude diamonds with a volume greater than or equal to 800 and diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot.

# Restrict to physically plausible volumes, then compare several fits:
# straight line (blue/default), quadratic (green) and cubic (pink)
# polynomials, a default GAM spline (violet), and a 3-knot GAM (red).
# coord_cartesian() zooms the view without refitting the models.
# NOTE(review): yellow points on a white background are hard to see --
# consider a darker colour. method = "gam" needs the mgcv package.
diamonds_0_800 <- subset(diamonds, volume > 0 & volume < 800)
ggplot(data = diamonds_0_800, aes(x = price, y = volume)) +
  geom_point(alpha = 1/10, color = "yellow") +
  geom_smooth(method = "lm", formula = y ~ x, size = 0.5) +
  geom_smooth(method = "lm", formula = y ~ poly(x,2), size = 1, color = "green") +
  geom_smooth(method = "lm", formula = y ~ poly(x,3), size = 1, color = "pink") +
  geom_smooth(method = "gam", formula = y ~ s(x), size = 1, color = "violet") +
  geom_smooth(method = "gam", formula = y ~ s(x, k = 3), size = .5, color = "red") +
  coord_cartesian(ylim = c(0, 400))

Note: If you used the count() function from the plyr package before this exercise. You need to run this command to unload the plyr package.

# Unload plyr so dplyr's verbs stop being masked. The warning shows the
# plyr namespace itself stays loaded (ggplot2/scales import it), but the
# attached package -- and its masking -- is removed.
detach("package:plyr", unload=TRUE)
## Warning: 'plyr' namespace cannot be unloaded:
##   namespace 'plyr' is imported by 'scales', 'ggplot2' so cannot be unloaded

Mean, median, min and max price by clarity

# Price summary statistics (mean, median, min, max) and group size for
# each clarity grade, ordered by the clarity factor levels.
diamondsByClarity <- diamonds %>% 
  group_by(clarity) %>%
  summarise(mean_price = mean(price), 
          median_price = median(price),
          min_price = min(price),
          max_price = max(price),
          n = n()) %>%
  arrange(clarity)

head(diamondsByClarity)
## # A tibble: 6 x 6
##   clarity mean_price median_price min_price max_price     n
##     <ord>      <dbl>        <dbl>     <int>     <int> <int>
## 1      I1   3924.169         3344       345     18531   741
## 2     SI2   5063.029         4072       326     18804  9194
## 3     SI1   3996.001         2822       326     18818 13065
## 4     VS2   3924.989         2054       334     18823 12258
## 5     VS1   3839.455         2005       327     18795  8171
## 6    VVS2   3283.737         1311       336     18768  5066

Mean price by color

# Mean price for each color grade.
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))

Bar Charts of Mean Price

Notes: There are two types of bar charts, determined by what is mapped to bar height. By default, geom_bar uses stat=“count” which makes the height of the bar proportion to the number of cases in each group (or if the weight aesthetic is supplied, the sum of the weights). If you want the heights of the bars to represent values in the data, use stat=“identity” and map a variable to the y aesthetic.

# Bar charts of mean price: by clarity (top) and by color (bottom).
# stat = 'identity' maps the precomputed means directly to bar height.
p1 <- ggplot(aes(clarity, mean_price), data = diamondsByClarity) +
  geom_bar(stat = 'identity')
p2 <- ggplot(aes(color, mean_price), data = diamonds_mp_by_color) +
  geom_bar(stat = 'identity')

grid.arrange(p1, p2, ncol = 1)